import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from boss.items import BossItem


class ZhipinSpider(CrawlSpider):
    """Crawl spider that extracts job titles from zhipin.com job pages.

    Starting from ``start_urls``, links matching the patterns in ``rules``
    are requested and handed to :meth:`parse_job`; matched pages are not
    crawled any further (``follow=False``).
    """

    name = 'zhipin'
    # NOTE(review): no ``allowed_domains`` is set, so requests are not
    # restricted to zhipin.com -- consider adding one.
    start_urls = ['https://www.zhipin.com/job_detail/bbe8e8fc68fe39961nZy29W5GVZU.html']

    # Boss Zhipin requires a cookie in the request headers; without it the
    # site returns no usable data.
    rules = (
        # Job detail pages, e.g.
        # https://www.zhipin.com/job_detail/bbe8e8fc68fe39961nZy29W5GVZU.html
        # (listing pages look like
        # https://www.zhipin.com/c101280600/?query=seo&page=1 and are not
        # matched by any rule here).
        Rule(
            LinkExtractor(allow=r'.+job_detail.+\.html'),
            callback='parse_job',
            follow=False,
        ),
        # Pages of the form ``jobdesc.html?postId=<digits>``. The ``.`` and
        # ``?`` are escaped so they match literally; unescaped, ``l?`` made
        # the "l" optional and the pattern could never match a real query
        # string.
        # NOTE(review): this URL shape may be a leftover from another
        # spider -- confirm it exists on zhipin.com.
        Rule(
            LinkExtractor(allow=r'.+jobdesc\.html\?postId=\d+'),
            callback='parse_job',
            follow=False,
        ),
    )

    def parse_job(self, response):
        """Extract the job title from a job page and yield it as an item.

        Args:
            response: The Scrapy response for a page matched by ``rules``.

        Yields:
            BossItem: An item whose ``title`` is the first matching text
            node, or ``None`` when the page has no matching element.
        """
        self.logger.debug('Parsing job page: %s', response.url)
        # The original expression was missing the closing quote of the
        # @class literal, which made the XPath invalid. ``contains`` is used
        # because ``xh-highlight`` is presumably a class injected by a
        # browser XPath helper and absent from the served HTML -- TODO
        # confirm against the live page.
        title = response.xpath(
            '//div[contains(@class, "work-title")]/text()'
        ).get()
        # TODO: also extract the salary, e.g. from
        # //div[@class="name"]/span/text()
        yield BossItem(title=title)
